suppressPackageStartupMessages(library(tidyverse))
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.3
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'stringr' was built under R version 4.2.3
devtools::load_all('~/Google Drive/My Drive/Scripts/R_packages/myUtilities/')
## ℹ Loading myUtilities

Settings

# Root directory of the input data on the external volume.
data_dir <- '/Volumes/Mitsu_NGS_3/METTL2A/'

# Project directory; it becomes the working directory for this session.
# NOTE(review): export_tsv() is later called without `outdir` and reports a
# relative 'Tables/...' path, so it presumably relies on this setwd() --
# confirm before removing it.
wd <- "/Users/s-mitsutomi/My Drive (shuheimitsutomi@ric.u-tokyo.ac.jp)/Analysis/METTL2A/"
setwd(wd)

# Output directories for figures and tables (`wd` already ends with '/').
figdir <- paste0(wd, 'Figures/DRS_m3C_sites/Metagene/')
tabledir <- paste0(wd, 'Tables/DRS_m3C_sites/')

# Default ggplot2 theme for every plot built after this point:
# classic theme, base font size 7, legend placed below the panel.
theme_set(
  theme_classic(base_size = 7) +
    theme(legend.position = 'bottom')
)

Functions

#' Add a refined gene-category column
#'
#' Appends `genetype2`. Rows whose `gene_type` is 'protein_coding' are
#' labelled 'mt-mRNA' when `seqname` is 'chrM' and 'mRNA' when `seqname` is
#' anything else; every remaining row keeps its `gene_type` value.
#'
#' @param df A data frame with `gene_type` and `seqname` columns.
#' @return `df` with the additional column `genetype2`.
add_genetype2 <- function(df) {
  mutate(
    df,
    genetype2 = case_when(
      seqname == 'chrM' & gene_type == 'protein_coding' ~ 'mt-mRNA',
      seqname != 'chrM' & gene_type == 'protein_coding' ~ 'mRNA',
      .default = gene_type
    )
  )
}

#' Express k-mer coordinates relative to transcript length
#'
#' Looks up the length of each transcript by `transcript_id` and divides the
#' k-mer start, middle and end coordinates by that length.
#'
#' @param df A data frame with `transcript_id`, `kmer_start`, `kmer_middle`
#'   and `kmer_end` columns.
#' @param transcript_lengths A data frame with `transcript_id` and `length`
#'   columns, at most one row per transcript. Defaults to the global
#'   `espresso_AsPC1_transcript_seq_length`, the table this function used to
#'   read implicitly, so existing one-argument calls keep working.
#' @return `df` with the added columns `length`, `rel_kmer_start`,
#'   `rel_kmer_middle` and `rel_kmer_end`. Rows whose transcript is absent
#'   from `transcript_lengths` get `NA` in these columns.
calc_kmer_relative_position <- function(
    df,
    transcript_lengths = espresso_AsPC1_transcript_seq_length
) {

  df |>
    left_join(
      # Only the length is needed; dropping everything else keeps the join
      # independent of whatever other columns the lookup table carries.
      transcript_lengths |> select(transcript_id, length),
      # Explicit key instead of a natural join on all shared column names.
      by = 'transcript_id',
      # Fail loudly if a transcript id is duplicated in the lookup table,
      # which would otherwise silently duplicate positions.
      relationship = 'many-to-one'
    ) |>
    mutate(
      rel_kmer_start  = kmer_start / length,
      rel_kmer_middle = kmer_middle / length,
      rel_kmer_end    = kmer_end / length
    )

}

Read data

# Load the table of DRS positions that the rest of this analysis works on.
DRS_methylated_positions <-
  wd |>
  paste0('Tables/DRS/Positions/common_sig_seqs_in_intensity_up_2024-04-10.tsv.gz') |>
  read_tsv()
## Rows: 605 Columns: 65
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (30): transcript_id, transcript_name, ref_kmer, GMM_cov_type_G, cluster_...
## dbl (35): position, GMM_logit_pvalue_G, KS_dwell_pvalue_G, KS_intensity_pval...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print the imported tibble to inspect its columns.
DRS_methylated_positions
## # A tibble: 605 × 65
##    transcript_id     transcript_name position ref_kmer GMM_logit_pvalue_G
##    <chr>             <chr>              <dbl> <chr>                 <dbl>
##  1 ENST00000429711.7 RPL32-204            422 GCCCA                 1    
##  2 ENST00000647248.2 RPL35A-211           380 ACCCC                 1    
##  3 ENST00000647248.2 RPL35A-211           381 CCCCT                 1    
##  4 ENST00000389680.2 MT-RNR1-201           43 ACACA                 1    
##  5 ENST00000389680.2 MT-RNR1-201           57 CCCCG                 1    
##  6 ENST00000389680.2 MT-RNR1-201           71 GTTCA                 1    
##  7 ENST00000389680.2 MT-RNR1-201           73 TCACC                 1    
##  8 ENST00000389680.2 MT-RNR1-201           75 ACCCT                 0.777
##  9 ENST00000389680.2 MT-RNR1-201           93 ATCAA                 1    
## 10 ENST00000389680.2 MT-RNR1-201          138 GCTTA                 1    
## # ℹ 595 more rows
## # ℹ 60 more variables: KS_dwell_pvalue_G <dbl>, KS_intensity_pvalue_G <dbl>,
## #   GMM_cov_type_G <chr>, GMM_n_clust_G <dbl>, cluster_counts_G <chr>,
## #   Logit_LOR_G <dbl>, c1_mean_intensity_G <dbl>, c2_mean_intensity_G <dbl>,
## #   c1_median_intensity_G <dbl>, c2_median_intensity_G <dbl>,
## #   c1_sd_intensity_G <dbl>, c2_sd_intensity_G <dbl>, c1_mean_dwell_G <dbl>,
## #   c2_mean_dwell_G <dbl>, c1_median_dwell_G <dbl>, c2_median_dwell_G <dbl>, …
# Per-transcript lookup table with id, sequence and length. The column names
# are assigned here through `col_names`.
# The path is built from the `data_dir` setting rather than repeating the
# volume prefix, so relocating the data needs a change in one place only.
espresso_AsPC1_transcript_seq_length <-
  read_tsv(
    paste0(
      data_dir,
      'Database/Custom/Espresso_AsPC1/Espresso_AsPC1.transcripts.tsv'
    ),
    col_names = c('transcript_id', 'seq', 'length')
  )
## Rows: 36717 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): transcript_id, seq
## dbl (1): length
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Write the lookup table to disk with the export helper.
# NOTE(review): the helper appears to take the file name from the name of the
# object it is given, so pass the object itself rather than an expression.
export_tsv(espresso_AsPC1_transcript_seq_length)
## 
## Exported to: Tables/espresso_AsPC1_transcript_seq_length_2024-04-15.tsv
## # A tibble: 36,717 × 3
##    transcript_id      seq                                                 length
##    <chr>              <chr>                                                <dbl>
##  1 ENST00000339437.11 AGCCCGGAAGTGCGCGTGGCGGCGGTGGCGGCTGCGGCAACAGCGGGGCC…    987
##  2 ENST00000251607.11 AGCCCGGAAGTGCGCGTGGCGGCGGTGGCGGCTGCGGCAACAGCGGGGCC…   2252
##  3 ENST00000420393.5  CAGCGGGGCCGGTAAGCGGGCGCGCGCCGCTCAGAGGGGCAGAGTTGGTG…    854
##  4 ENST00000698415.1  GATGTATGATGAGTTTAGTTGAATGCTCGTGTTGCTGTCTGCTAGCCAAA…   6597
##  5 ENST00000698416.1  CATGACTAGTTTTGTGGGTAGCAATGATGTTTAAATGTCACACACTAACC…   5500
##  6 ENST00000488263.5  AGGAACTTCATCATGAAGTCTCAAGTAAACGAACATTTTATCTTTCTTGG…   4528
##  7 ENST00000424814.5  GAGATCAGCAGGACGCTGCGCACAACATGGGCAACCACCTGCCGCTCCTG…   2038
##  8 ENST00000231948.9  AGACATGGCCGGCGAAGGAGATCAGCAGGACGCTGCGCACAACATGGGCA…   2187
##  9 ENST00000432408.6  GCCTCCTTTGCGGGTAAACAGACATGGCCGGCGAAGGAGATCAGCAGGAC…   2203
## 10 ENST00000459840.5  ATGGAGGCATTTAAACTGGGACTGAGATGGGACTGAGTGATTAAATTGCT…    723
## # ℹ 36,707 more rows

Calculate k-mer ranges

# Derive the start, end and middle coordinate of each 5-mer from `position`,
# keep the identifying columns, add the gene category and convert the
# coordinates to fractions of the transcript length.
# NOTE(review): the +1 / +5 / +3 offsets presumably turn a 0-based k-mer
# start into 1-based start / end / middle coordinates -- confirm against the
# tool that produced `position`.
# `contains('kmer')` also keeps `ref_kmer`, and the order of the mutate()
# arguments determines the column order of the exported table.
DRS_methylated_positions_relative_range <- 
  DRS_methylated_positions |> 
  mutate(
    kmer_start = position + 1, kmer_end = position + 5, 
    kmer_middle = position + 3
  ) |> 
  select(transcript_id, gene_name, seqname, gene_type, contains('kmer')) |> 
  add_genetype2() |> 
  calc_kmer_relative_position()
## Joining with `by = join_by(transcript_id)`
# Write the relative-position table into the table output directory.
# NOTE(review): the helper appears to take the file name from the name of the
# object it is given, so pass the object itself rather than an expression.
export_tsv(DRS_methylated_positions_relative_range, outdir = tabledir)
## 
## Exported to: /Users/s-mitsutomi/My Drive (shuheimitsutomi@ric.u-tokyo.ac.jp)/Analysis/METTL2A/Tables/DRS_m3C_sites/DRS_methylated_positions_relative_range_2024-04-15.tsv
## # A tibble: 605 × 13
##    transcript_id     gene_name seqname gene_type    ref_kmer kmer_start kmer_end
##    <chr>             <chr>     <chr>   <chr>        <chr>         <dbl>    <dbl>
##  1 ENST00000429711.7 RPL32     chr3    protein_cod… GCCCA           423      427
##  2 ENST00000647248.2 RPL35A    chr3    protein_cod… ACCCC           381      385
##  3 ENST00000647248.2 RPL35A    chr3    protein_cod… CCCCT           382      386
##  4 ENST00000389680.2 MT-RNR1   chrM    Mt_rRNA      ACACA            44       48
##  5 ENST00000389680.2 MT-RNR1   chrM    Mt_rRNA      CCCCG            58       62
##  6 ENST00000389680.2 MT-RNR1   chrM    Mt_rRNA      GTTCA            72       76
##  7 ENST00000389680.2 MT-RNR1   chrM    Mt_rRNA      TCACC            74       78
##  8 ENST00000389680.2 MT-RNR1   chrM    Mt_rRNA      ACCCT            76       80
##  9 ENST00000389680.2 MT-RNR1   chrM    Mt_rRNA      ATCAA            94       98
## 10 ENST00000389680.2 MT-RNR1   chrM    Mt_rRNA      GCTTA           139      143
## # ℹ 595 more rows
## # ℹ 6 more variables: kmer_middle <dbl>, genetype2 <chr>, length <dbl>,
## #   rel_kmer_start <dbl>, rel_kmer_middle <dbl>, rel_kmer_end <dbl>
# List the distinct gene categories present in the table.
unique(DRS_methylated_positions_relative_range$genetype2)
## [1] "mRNA"    "Mt_rRNA" "mt-mRNA" NA

Plot

#' Save a histogram of relative site positions for one gene category
#'
#' Filters the global `DRS_methylated_positions_relative_range` to the rows
#' whose `genetype2` equals `genetype`, draws a 50-bin histogram of
#' `rel_kmer_middle` and passes the plot to `ggsave_multiple_formats()` with
#' the global `figdir` as output directory.
#'
#' @param genetype A single `genetype2` value, e.g. 'mRNA'.
#' @return The value returned by `ggsave_multiple_formats()`.
plot_distribution_DRS_relpositions_each_genetype2 <- function(genetype) {
  selected_sites <- filter(
    DRS_methylated_positions_relative_range,
    genetype2 == genetype
  )

  # The local name `distribution_plot` is kept as-is in case the save helper
  # inspects the name of the object it receives.
  distribution_plot <-
    ggplot(selected_sites, aes(x = rel_kmer_middle)) +
    geom_histogram(bins = 50)

  ggsave_multiple_formats(
    distribution_plot,
    width = 4, height = 2.5, fontsize = 7,
    basename = paste0('DRS_m3Csites_distribution_', genetype),
    outdir = figdir
  )
}

# Save one histogram per gene category.
# NOTE(review): these are deliberately left as separate top-level calls; the
# value returned by each call is subject to auto-printing here, which a
# walk()/loop rewrite would suppress -- confirm before refactoring.
plot_distribution_DRS_relpositions_each_genetype2('Mt_rRNA')

plot_distribution_DRS_relpositions_each_genetype2('mt-mRNA')

plot_distribution_DRS_relpositions_each_genetype2('mRNA')

#' Print and save a density plot of relative site positions for one category
#'
#' Filters the global `DRS_methylated_positions_relative_range` to the rows
#' whose `genetype2` equals `genetype`, draws the density of
#' `rel_kmer_middle`, prints the plot and then passes it to
#' `ggsave_multiple_formats()` with the global `figdir` as output directory.
#'
#' @param genetype A single `genetype2` value, e.g. 'mRNA'.
#' @return The value returned by `ggsave_multiple_formats()`.
plot_density_DRS_relpositions_each_genetype2 <- function(genetype) {
  selected_sites <- filter(
    DRS_methylated_positions_relative_range,
    genetype2 == genetype
  )

  # The local name `distribution_plot` is kept as-is in case the save helper
  # inspects the name of the object it receives.
  distribution_plot <-
    ggplot(selected_sites, aes(x = rel_kmer_middle)) +
    geom_density()

  # Explicit print so the plot is displayed even when the function is called
  # from inside an iteration helper.
  print(distribution_plot)

  ggsave_multiple_formats(
    distribution_plot,
    width = 4, height = 2.5, fontsize = 7,
    basename = paste0('DRS_m3Csites_density_', genetype),
    outdir = figdir
  )
}

# Print and save a density plot for each gene category.
walk(
  c('Mt_rRNA', 'mt-mRNA', 'mRNA'),
  plot_density_DRS_relpositions_each_genetype2
)

# Histograms of the relative k-mer middle position, one facet row per gene
# category. Rows without a category are dropped and each facet gets its own
# y-axis scale.
distribution_plot_groupedby_genetype2 <-
  ggplot(
    filter(DRS_methylated_positions_relative_range, !is.na(genetype2)),
    aes(x = rel_kmer_middle)
  ) +
  geom_histogram(bins = 50) +
  facet_wrap(vars(genetype2), scales = 'free_y', ncol = 1)

# No `basename` is supplied here, so the object is passed under its own name.
ggsave_multiple_formats(
  distribution_plot_groupedby_genetype2,
  width = 3.5, height = 7, outdir = figdir
)